This document contains a reproducible query that searches for the terms “disinformation” and “misinformation” in the titles and abstracts of peer-reviewed articles published between 2000 and 2025. The data are retrieved from the Semantic Scholar API and cleaned for encoding errors and duplicate results.
Two tables are constructed with the results:
Annual Publication Counts: A summary table displaying the count of articles containing “disinformation” compared to “misinformation” for each year, alongside the total article count.
Top Cited Literature: A list of the most influential papers retrieved, ranked by citation count and including metadata such as authors, year, and Semantic Scholar URL.
library(httr2)
library(purrr)
library(dplyr)
library(tibble)
library(stringi)
library(DT)
library(stringr)
library(knitr)
library(kableExtra)
# UTF-8 text normalization: coerce strings to valid UTF-8, flagging
# unrecognized 8-bit input as an unknown encoding so stray bytes are
# substituted rather than passed through (per stri_enc_toutf8 docs).
sanitize_utf8 <- function(x) {
  stringi::stri_enc_toutf8(
    x,
    is_unknown_8bit = TRUE
  )
}
# API Query Builder
#
# Query the Semantic Scholar paper-search endpoint and flatten the JSON
# response into a tibble with one row per paper.
#
# query              - search string passed to the API
# offset             - pagination offset (clamped to >= 0)
# limit              - page size (clamped to 1..100, the API's cap)
# year_min, year_max - inclusive publication-year window
#
# Returns a tibble with columns abstract, title, authors, year,
# citations, link. On a failed request or an empty result set it
# returns a zero-row tibble with the same column types, so callers
# can bind results unconditionally.
search_semantic_scholar <- function(query,
                                    offset = 0,
                                    limit = 100,
                                    year_min = 2000,
                                    year_max = 2025) {
  # Validate API key
  api_key <- Sys.getenv("SS_API_KEY")
  if (api_key == "") {
    stop("Semantic Scholar API key not found in environment variable SS_API_KEY")
  }

  # Clamp pagination parameters to the range the API accepts
  limit <- min(max(limit, 1), 100)  # Hard cap on results per request
  offset <- max(offset, 0)

  resp <- tryCatch(
    {
      request("https://api.semanticscholar.org/graph/v1/paper/search") %>%
        req_headers(
          "x-api-key" = api_key,
          "Accept" = "application/json") %>%
        req_url_query(
          query = query,
          offset = offset,
          limit = limit,
          year = paste0(year_min, "-", year_max),
          fields = paste(
            c("title",
              "abstract",
              "authors.name",
              "year",
              "citationCount",
              "url"),
            collapse = ",")) %>%
        req_retry(
          max_tries = 3,
          backoff = function(i) 2 ^ i) %>%  # Exponential backoff
        req_perform() %>%
        resp_body_json()
    },
    error = function(e) {
      # Surface the failure instead of swallowing it silently; the
      # empty-tibble fallback below still keeps pagination loops safe.
      warning("Semantic Scholar request failed: ", conditionMessage(e),
              call. = FALSE)
      NULL
    })

  # Defensive logic: failed request or no matching papers
  if (is.null(resp) || is.null(resp$data) || length(resp$data) == 0) {
    return(tibble(
      abstract = character(),
      title = character(),
      authors = character(),
      year = integer(),
      citations = integer(),
      link = character()))
  }

  # Normal success path: one row per paper, text fields normalized to UTF-8
  tibble(
    abstract = sanitize_utf8(
      map_chr(resp$data, "abstract", .default = NA_character_)),
    title = sanitize_utf8(
      map_chr(resp$data, "title", .default = NA_character_)),
    authors = sanitize_utf8(
      map_chr(
        resp$data,
        ~ paste(map_chr(.x$authors, "name"), collapse = ", "),
        .default = NA_character_)),
    year = map_int(resp$data, "year", .default = NA_integer_),
    citations = map_int(resp$data, "citationCount", .default = NA_integer_),
    link = map_chr(resp$data, "url", .default = NA_character_))
}
# Rate-Limited Paginated Data Retrieval
# Search terms to crawl; each gets its own full paginated retrieval.
queries <- c("misinformation", "disinformation")
# Safe paginated fetch for one query
#
# Walks the result pages for a single search term, sleeping a random
# interval before each request to stay under the API rate limit.
#
# query              - search string
# year_min, year_max - inclusive publication-year window
# limit              - page size (API caps at 100)
# sleep_rng          - c(min, max) seconds of random pause per request
#
# Returns one tibble of all pages, tagged with the originating query
# and the offset each page was retrieved at.
fetch_query_pages <- function(query,
                              year_min = 2000,
                              year_max = 2025,
                              limit = 100,
                              sleep_rng = c(1.5, 3)) {
  offset <- 0
  out <- list()
  repeat {
    # Randomized politeness delay before every request
    Sys.sleep(runif(1, sleep_rng[1], sleep_rng[2]))
    res <- search_semantic_scholar(
      query = query,
      offset = offset,
      limit = limit,
      year_min = year_min,
      year_max = year_max)
    # Stop when the API returns nothing (end of results or failed request)
    if (nrow(res) == 0) break
    out[[length(out) + 1]] <- res %>%
      mutate(retrieval_query = query,
             retrieval_offset = offset)
    # A short page is necessarily the last page -- stop here instead of
    # issuing one more rate-limited request that would come back empty.
    if (nrow(res) < limit) break
    offset <- offset + limit
  }
  bind_rows(out)
}
# Run the paginated fetch for every search term and stack the pages
# into a single raw-results tibble.
results_raw <- list_rbind(
  map(
    queries,
    fetch_query_pages,
    year_min = 2000,
    year_max = 2025))
# Output 1: Year-level article counts table
# One row per year with columns:
# - Year published
# - Articles mentioning 'disinformation' in title or abstract
# - Articles mentioning 'misinformation' in title or abstract
# - Total number of articles
article_counts <- results_raw %>%
  # Drop duplicate papers surfaced by both search terms
  distinct(title, year, .keep_all = TRUE) %>%
  # Lower-cased title + abstract (abstract may be NA) for term matching
  mutate(searchable = str_to_lower(paste(title, coalesce(abstract, "")))) %>%
  group_by(year) %>%
  summarise(
    disinformation_titles = sum(
      str_detect(searchable, "disinformation"), na.rm = TRUE),
    misinformation_titles = sum(
      str_detect(searchable, "misinformation"), na.rm = TRUE),
    total_titles = n(),
    .groups = "drop") %>%
  arrange(desc(year))
# Plain data.frame copy for DT, with columns in display order
article_counts_dt <- as.data.frame(
  article_counts[, c("year",
                     "disinformation_titles",
                     "misinformation_titles",
                     "total_titles")])
# Render the annual-counts table with export buttons (copy/CSV/Excel)
DT::datatable(
  article_counts_dt,
  rownames = FALSE,
  extensions = "Buttons",
  colnames = c("Year",
               "Articles mentioning 'Disinformation'",
               "Articles mentioning 'Misinformation'",
               "Total Articles"),
  options = list(
    dom = "Bfrtip",
    buttons = c("copy", "csv", "excel"),
    pageLength = 15,
    autoWidth = TRUE,
    order = list(list(0, "desc")),  # sort by Year, newest first
    columnDefs = list(
      list(className = "dt-center", targets = "_all"))))
# Output 2: Top cited literature
# One row per paper:
# - Title
# - Authors
# - Year
# - Citation Count
# - Semantic Scholar URL
#
# NOTE: the original pipeline also computed lower-cased text and
# disinformation/misinformation flags here, but transmute() discarded
# them immediately -- that dead work is removed.
results_display <- results_raw %>%
  # Drop duplicate papers surfaced by both search terms
  distinct(title, year, .keep_all = TRUE) %>%
  arrange(desc(citations)) %>%
  transmute(
    title,
    authors,
    year,
    citations,
    # Render as an HTML anchor; keep NA when the API returned no URL
    # so the cell doesn't become a link to the literal string "NA".
    link = if_else(
      is.na(link),
      NA_character_,
      sprintf('<a href="%s" target="_blank">link</a>', link)))
# Render the top-cited-papers table; escape = FALSE so the link column's
# HTML anchors are interpreted rather than shown as text.
DT::datatable(
  results_display,
  escape = FALSE,
  rownames = FALSE,
  extensions = "Buttons",
  colnames = c("Title", "Authors", "Year", "Citations", "Link"),
  options = list(
    dom = "Bfrtip",
    buttons = c("copy", "csv", "excel"),
    pageLength = 5,
    autoWidth = FALSE,
    order = list(list(3, "desc")),  # sort by Citations, highest first
    columnDefs = list(
      list(width = "450px", targets = 0),    # Title
      list(width = "200px", targets = 1),    # Authors
      list(width = "80px", targets = 2),     # Year
      list(width = "90px", targets = 3),     # Citations
      list(width = "70px", targets = 4))))   # Link